#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
@date: 2011-11-16
@author: shell.xu
'''
import os, sys, time, getopt, traceback
import unicodedata, chardet, segment
from os import path

# Default dictionary database file name; main() replaces it with the value
# of the -d command-line option when that option is given.
dbname = 'frq.db'

def walkdir(basepath, func, *params):
    ''' Call func(filepath, *params) for every file under basepath.

    The tree is traversed with os.walk. An exception raised by func for one
    file is reported with traceback.print_exc() and the walk goes on with
    the next file.
    '''
    for base, dirs, files in os.walk(basepath):
        for filename in files:
            # Catch Exception, not everything: KeyboardInterrupt and
            # SystemExit must still be able to stop a long walk.
            try: func(path.join(base, filename), *params)
            except Exception: traceback.print_exc()

def cutstr(sentence):
    ''' cutstr sentence : cut string and show out. '''
    cut = segment.get_cutter(dbname)
    segment.DynamicCutter.DEBUG = True
    print '|'.join(cut.parse(sentence.decode('utf-8')))

def cut(*filepathes):
    ''' cut filepath ... : cut a file and not print out. '''
    # The docstring above is printed verbatim as usage text by main().
    cutter = segment.get_cutter(dbname)
    for fp in filepathes:
        # Walk through the whole result and throw the pieces away.
        for _ in cutter.parsefile(fp):
            pass

def cutshow(*filepathes):
    ''' cutshow filepath ... : cut a file and print out. '''
    cut = segment.get_cutter(dbname)
    for fp in filepathes: print '|'.join(cut.parsefile(fp)).encode('utf-8')

def frqtrain(*filepathes):
    ''' frqtrain filepath ... : train frequency by files. '''
    # The docstring above is printed verbatim as usage text by main().
    stat = segment.StatCutter(segment.dictdb(dbname))
    cut = segment.StringCutter(stat)
    # Pass the loop variable `fp`; the former `filepath` was never defined
    # in this function and raised NameError on the first file.
    # NOTE(review): cut()/cutshow() iterate over parsefile()'s result; if it
    # is a lazy generator the call here must be drained too -- confirm
    # against the segment module.
    for fp in filepathes: cut.parsefile(fp)
    stat.train()

def frqtrains(basepath):
    ''' frqtrains dirpath : train frequency by all files under dir. '''
    # The docstring above is printed verbatim as usage text by main().
    stat = segment.StatCutter(segment.dictdb(dbname))
    cut = segment.StringCutter(stat)
    # Use `parsefile`, the spelling used by cut(), cutshow() and frqtrain();
    # the former `paesefile` looks like a typo and would raise
    # AttributeError before the walk even starts.
    # NOTE(review): walkdir() discards the return value; if parsefile() is a
    # lazy generator it must be drained for training to see the data --
    # confirm against the segment module.
    walkdir(basepath, cut.parsefile)
    stat.train(True)

def newtrain(*filepathes):
    ''' newtrain filepath ... : train new words by files. '''
    new = segment.NewCutter(segment.dictdb(dbname))
    cut = segment.StringCutter(new)
    for fp in filepathes: cut.paesefile(fp)
    for word, frq in new.get_highfrq():
        print word.encode('utf-8'), frq

def newtrains(basepath):
    ''' newtrains dirpath : train new words by all files under dir. '''
    new = segment.NewCutter(segment.dictdb(dbname))
    cut = segment.StringCutter(new)
    walkdir(basepath, cut.paesefile)
    for word, frq in new.get_highfrq():
        print word.encode('utf-8'), frq

def frqfile(filepath, frq):
    print 'process', filepath
    data = segment.readfile_cd(filepath)
    for i in data:
        if i not in frq: frq[i] = 0
        frq[i] += 1

def frqstat(*filepathes):
    ''' frqstat filepath ... : statistics frequency of char in files. '''
    frq = {}
    for fp in filepathes: frqfile(fp, frq)
    for k, v in sorted(frq.items(), key = lambda x:x[1], reverse = True):
        if unicodedata.category(k) != 'Lo': continue
        print k.encode('utf-8'), v

def frqstats(basepath):
    ''' frqstats dirpath : statistics frequency in all files under dir. '''
    frq = {}
    walkdir(basepath, frqfile, frq)
    for k, v in sorted(frq.items(), key = lambda x:x[1], reverse = True):
        if unicodedata.category(k) != 'Lo': continue
        print k.encode('utf-8'), v

# Names of the functions that may be invoked from the command line.
cmds = ['cutstr', 'cut', 'cutshow', 'frqtrain', 'frqtrains',
        'newtrain', 'newtrains', 'frqstat', 'frqstats']
def main():
    ''' Command-line entry point: prog [-d dbname] cmd params ...

    -d replaces the module-level dbname. Without a command, a usage line
    and the docstring of every command in cmds are printed. Otherwise the
    named command is called with the remaining arguments as strings.
    Exits with an error message when the command is not listed in cmds.
    '''
    global dbname
    opts, args = getopt.getopt(sys.argv[1:], 'd:')
    for opt, val in opts:
        if opt == '-d': dbname = val
    if not args:
        # Single-argument print(...) behaves the same as the print statement.
        print('%s cmds params ...' % sys.argv[0])
        for name in cmds: print('%s:%s' % (name, globals()[name].__doc__))
        return
    name = args[0]
    # Dispatch by name through cmds instead of eval(): eval() executed any
    # expression typed on the command line and ignored the cmds whitelist.
    if name not in cmds:
        sys.exit('unknown command: %s (expected one of: %s)'
                 % (name, ', '.join(cmds)))
    globals()[name](*args[1:])

# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__': main()
